# Global knitr chunk options used while rendering the codebook.
knitr::opts_chunk$set(
  echo = TRUE,    # include the R source of every chunk in the output
  message = TRUE, # keep messages raised during codebook generation
  warning = TRUE, # keep warnings raised during codebook generation
  error = TRUE    # keep rendering when a chunk errors (easier debugging)
)
# Table option for pander: an infinite split width, so wide tables stay whole.
pander::panderOptions("table.split.table", Inf)
# Make the black-and-white theme the default for all ggplot2 figures.
ggplot2::theme_set(ggplot2::theme_bw())
# load libraries
library(codebook)
library(here)
library(dplyr)
library(tidyverse)
library(future)
library(labelled)

This is a data dictionary for the data set used in the paper “Patterns of language switching and bilingual children’s word learning: An experiment across two communities”.

# Read the cleaned data set from the project-relative path.
data_clean <- here("analysis/data_clean.csv") %>%
  read.csv()

Variable overview

# Render the tabular overview of the variables in data_clean.
data_clean %>%
  codebook_items()

Codebook table

Subject information

  • subject_id: Unique participant ID
  • lang_comm: The bilingual language community to which the participant belongs; 2 levels: French-English & Spanish-English
  • gender: 2 levels: Female & Male
  • age_in_years: Participant’s age in years
  • age_in_months: Participant’s age in months
  • parent_edu_level: Highest education level of participant’s parent
  • global_exposure_eng: Parent-reported global exposure to English (out of 100%)
  • global_exposure_fr: Parent-reported global exposure to French (out of 100%)
  • global_exposure_sp: Parent-reported global exposure to Spanish (out of 100%)
  • global_exposure_other: Parent-reported global exposure to another language (out of 100%)
  • comprehension_proficiency_eng: Parent-reported English comprehension proficiency score (out of 10)
  • comprehension_proficiency_fr: Parent-reported French comprehension proficiency score (out of 10)
  • comprehension_proficiency_sp: Parent-reported Spanish comprehension proficiency score (out of 10)

Number of participants

This dataset included 35 French-English bilingual children (19 girls) and 27 Spanish-English bilingual children (13 girls) who participated in our experiment.

Mean age (in months) of participants

# Age in months per language community: mean, SD, minimum and maximum.
# One row per subject_id is kept first so that each participant contributes
# a single age value to the summary.
data_clean %>%
  distinct(subject_id, .keep_all = TRUE) %>%
  group_by(lang_comm) %>%
  summarize(
    mean_age_in_months = mean(age_in_months, na.rm = TRUE),
    sd = sd(age_in_months, na.rm = TRUE),
    min = min(age_in_months, na.rm = TRUE),
    max = max(age_in_months, na.rm = TRUE)
  )
## # A tibble: 2 x 5
##   lang_comm       mean_age_in_months    sd   min   max
##   <chr>                        <dbl> <dbl> <dbl> <dbl>
## 1 French-English                49.0  7.12  36.5  60.6
## 2 Spanish-English               49.3  9.21  36.3  63.0

Distribution of parents’ education level

# Count and percentage of participants at each parental education level,
# computed within each language community.
data_clean %>%
  select(subject_id, lang_comm, parent_edu_level) %>%
  # one row per participant
  distinct(subject_id, .keep_all = TRUE) %>%
  group_by(lang_comm) %>%
  # community size, used as the denominator of the percentage
  mutate(total_n = n()) %>%
  select(lang_comm, parent_edu_level, total_n) %>%
  group_by(lang_comm, parent_edu_level) %>%
  mutate(
    n = n(),
    percent = round(n / total_n * 100)
  ) %>%
  # collapse to one row per community x education level
  distinct(parent_edu_level, .keep_all = TRUE)
## # A tibble: 12 x 5
## # Groups:   lang_comm, parent_edu_level [12]
##    lang_comm       parent_edu_level                      total_n     n percent
##    <chr>           <chr>                                   <int> <int>   <dbl>
##  1 French-English  Attestation of College Studies (AÉC)      35     1       3
##  2 French-English  College Certificate/Diploma                35     6      17
##  3 French-English  Master’s Degree                          35     8      23
##  4 French-English  Bachelor’s Degree                        35    13      37
##  5 French-English  Doctoral Degree                            35     3       9
##  6 French-English  Some College/University                    35     4      11
##  7 Spanish-English Bachelor’s Degree                        27     5      19
##  8 Spanish-English Master’s Degree                          27    14      52
##  9 Spanish-English Doctoral Degree                            27     2       7
## 10 Spanish-English College Certificate/Diploma                27     2       7
## 11 Spanish-English Some College/University                    27     1       4
## 12 Spanish-English <NA>                                       27     3      11

Mean language exposure

# Per-participant exposure in long format: one row per participant and
# language, where "French/Spanish" is the French exposure for the
# French-English community and the Spanish exposure for the Spanish-English
# community.
individual_exposure <- data_clean %>%
  select(subject_id, lang_comm, matches("exposure")) %>%
  distinct(subject_id, .keep_all = TRUE) %>%
  mutate(
    global_exposure_fr_sp = case_when(
      lang_comm == "French-English" ~ as.numeric(global_exposure_fr),
      lang_comm == "Spanish-English" ~ as.numeric(global_exposure_sp),
      TRUE ~ NA_real_
    )
  ) %>%
  select(subject_id, lang_comm, global_exposure_eng, global_exposure_fr_sp) %>%
  pivot_longer(
    -c(subject_id, lang_comm),
    names_to = "language",
    values_to = "exposure"
  ) %>%
  mutate(
    language = recode(
      language,
      "global_exposure_eng" = "English",
      "global_exposure_fr_sp" = "French/Spanish"
    )
  )

# Group means (bars) with normal-approximation 95% confidence intervals
# (error bars), overlaid with the individual participants (dots).
# The summary reuses individual_exposure instead of repeating the pipeline.
individual_exposure %>%
  group_by(lang_comm, language) %>%
  summarize(
    mean_exposure = mean(exposure, na.rm = TRUE),
    sd_exposure = sd(exposure, na.rm = TRUE),
    min_exposure = min(exposure, na.rm = TRUE),
    max_exposure = max(exposure, na.rm = TRUE),
    # Divide by the number of non-missing values: the SD ignores NAs, so
    # using n() here would understate the standard error when data are missing.
    n_obs = sum(!is.na(exposure)),
    se = sd_exposure / sqrt(n_obs),
    CI_lower = mean_exposure - (1.96 * se),
    CI_upper = mean_exposure + (1.96 * se)
  ) %>%
  # same y variable name as in individual_exposure, so both layers share aes
  rename(exposure = mean_exposure) %>%
  ggplot(aes(x = lang_comm, y = exposure, fill = language, color = language)) +
  geom_bar(
    stat = "identity",
    position = position_dodge(width = 0.9),
    alpha = .25
  ) +
  geom_errorbar(
    aes(ymin = CI_lower, ymax = CI_upper),
    width = .2,
    position = position_dodge(.9),
    color = "#4d4d4d",
    alpha = 0.75
  ) +
  geom_dotplot(
    data = individual_exposure,
    aes(y = exposure, x = lang_comm, fill = language, color = language),
    binaxis = "y",
    stackdir = "center",
    position = position_dodge(0.9),
    stackratio = 1,
    dotsize = 0.5,
    alpha = 0.8
  ) +
  coord_flip()

Exclusion criteria

  • missing_LEQ: 1 vs. 0; 1 = participant missing language comprehension proficiency data
# Number of distinct participants flagged with missing_LEQ == 1.
data_clean %>%
  filter(missing_LEQ == 1) %>%
  pull(subject_id) %>%
  n_distinct()
## [1] 4
  • exclude_preterm: 1 vs. 0; 1 = preterm
# Number of distinct participants flagged with exclude_preterm == 1.
data_clean %>%
  filter(exclude_preterm == 1) %>%
  pull(subject_id) %>%
  n_distinct()
## [1] 4
  • exclude_language_problem: 1 vs. 0; 1 = parent-reported language or speech development problem
# Number of distinct participants flagged with exclude_language_problem == 1.
data_clean %>%
  filter(exclude_language_problem == 1) %>%
  pull(subject_id) %>%
  n_distinct()
## [1] 1
  • exclude_incomplete: 1 vs. 0; 1 = incomplete data
# Number of distinct participants flagged with exclude_incomplete == 1.
data_clean %>%
  filter(exclude_incomplete == 1) %>%
  pull(subject_id) %>%
  n_distinct()
## [1] 1
  • exclude_parent: 1 vs. 0; 1 = parent-reported technical problems during the experiment
# Number of distinct participants flagged with exclude_parent == 1.
data_clean %>%
  filter(exclude_parent == 1) %>%
  pull(subject_id) %>%
  n_distinct()
## [1] 1

Note that participants flagged as 1 on any of these variables were excluded from the final analysis.

Experimental variables

  • test_order: Which of the 8 orders did the participant complete
    • English first, immediate-translation first (then French / then Spanish)
    • English first, one-language-at-a-time first (then French / then Spanish)
    • French first, immediate-translation first
    • French first, one-language-at-a-time first
    • Spanish first, immediate-translation first
    • Spanish first, one-language-at-a-time first
# List the test orders that occur in the data.
distinct(data_clean, test_order)
##   test_order
## 1      en_it
## 2      en_ol
## 3      fr_it
## 4      fr_ol
## 5      sp_it
## 6      sp_ol
  • block: 3 levels; familiar vs. block1 vs. block2
# List the block values that occur in the data.
distinct(data_clean, block)
##      block
## 1 familiar
## 2   block1
## 3   block2
  • test_part: 3 levels; fam_test vs. nov_learn vs. nov_test
    • fam_test refers to the familiar word test trials
    • nov_learn refers to the novel word learning trials
    • nov_test refers to the novel word test trials
# List the test_part values that occur in the data.
distinct(data_clean, test_part)
##   test_part
## 1  fam_test
## 2 nov_learn
## 3  nov_test
  • language: Which language the trial was in
# List the languages that occur within each test_part, sorted by test_part.
arrange(
  distinct(group_by(data_clean, test_part), language),
  test_part
)
## # A tibble: 13 x 2
## # Groups:   test_part [3]
##    test_part language       
##    <chr>     <chr>          
##  1 fam_test  english        
##  2 fam_test  french         
##  3 fam_test  spanish        
##  4 nov_learn english-french 
##  5 nov_learn english        
##  6 nov_learn french         
##  7 nov_learn french-english 
##  8 nov_learn english-spanish
##  9 nov_learn spanish        
## 10 nov_learn spanish-english
## 11 nov_test  english        
## 12 nov_test  french         
## 13 nov_test  spanish
  • condition: Which condition the trial was in
    • 3 levels; familiar vs. immediate_translation vs. one_language_at_a_time
# List the condition values that occur in the data.
distinct(data_clean, condition)
##                condition
## 1               familiar
## 2  immediate_translation
## 3 one_language_at_a_time
  • trial_index: Trial number
  • stimulus: Which stimulus was played in the trial
  • button_correct: The correct response for the trial
  • button_selected: Which response the participant selected
  • rt: The response time for the trial
  • accuracy: If button_correct = button_selected (i.e., the participant selected the correct response), then a score of 1 was given to the trial; if not, a score of 0 was given. If no response was made, then NA was assigned.

Number of participants per test order

# Bar chart of the number of participants in each test order, per language
# community.
data_clean %>%
  select(subject_id, lang_comm, test_order) %>%
  # one row per participant
  distinct(subject_id, .keep_all = TRUE) %>%
  count(lang_comm, test_order) %>%
  ggplot(aes(x = lang_comm, y = n, fill = test_order)) +
  geom_col(position = position_dodge(width = 0.9)) +
  # NOTE: ggplot2 removes bars that exceed the upper limit, so raise the
  # limit if any community x order cell has more than 10 participants.
  scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, by = 1))

Number of trials per participant

# Display order of the blocks in the legend and the facets.
level_order <- c("familiar", "block1", "block2")

# Number of test trials (fam_test and nov_test rows) per participant and
# block, drawn as horizontal bars with one facet per block.
data_clean %>%
  filter(test_part %in% c("fam_test", "nov_test")) %>%
  group_by(subject_id, block) %>%
  count() %>%
  ggplot(
    aes(x = subject_id, y = n, fill = factor(block, levels = level_order))
  ) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  scale_y_continuous(limits = c(0, 12), breaks = seq(0, 12, by = 1)) +
  # `levels` is spelled out in full rather than relying on partial
  # argument matching of `level`.
  facet_grid(factor(block, levels = level_order) ~ .) +
  coord_flip() +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12, color = "black"),
    legend.text = element_text(size = 12),
    legend.position = "bottom"
  )